org 100h   ; assume ax=bx=0 ch=0

  pop di   ; di=sp=0 (presumably the zero word DOS leaves on top of a .COM stack - TODO confirm)

;Prepare floating-point constants for SSE
;Fills 0x8000..0xffff, top down, with 16-byte vectors of four identical dwords.
;Each dword is ax<<16 (push ax = high word, push bx = low word = 0), i.e. the
;upper 16 bits of an IEEE-754 single, so K(x) can address any such constant.
;[0xfff0]=0xffe00000, [0xffe0]=0xffc00000, ... [0x8000]=0
;step 0x00200000: ... 1 1.25 1.5 1.75  2 2.5 3 3.5  4 5 6 7 ...
PK:
  mov cl,4     ; cx=4 because ch is assumed to be 0
  sub ax,0x20  ; next bit pattern; ZF from this sub is what jnz tests below
PKL:
  push ax  ; x
  push bx  ; 0
  loop PKL ; store four times (push and loop leave the flags untouched)
  jnz PK   ; loop 2048 times -> 32kB; sp=0x8000, ax=0 on exit


  ;Video and FPU setup
  ;ax is 0 here (exit condition of the PK loop), so ah=0 and only al needs loading
  mov al,13h
  int 10h      ; BIOS video service with ah=0, al=13h: set mode 13h
  push 0xa000
  pop es       ; es=0xa000, used for the pixel writes [es:di-4]
  fninit       ; reset the x87 FPU before the per-frame sin/cos work

;      ;Palette test
;      SEGMENTS equ 16
;      XRUN     equ 6
;      YRUN     equ 6
;      START    equ 0
;
;      %if YRUN<256
;        %define mov__cx_YRUN mov cl,YRUN
;      %else
;        %define mov__cx_YRUN mov cx,YRUN
;      %endif
;
;      %if XRUN<256
;        %define mov__cx_XRUN mov cl,XRUN
;      %else
;        %define mov__cx_XRUN mov cx,XRUN
;      %endif
;
;        pusha
;        salc
;        scasw
;      PW mov__cx_YRUN
;      PY mov bx,SEGMENTS
;        pusha
;        add al,START
;      PX mov__cx_XRUN
;        rep stosb
;        inc ax
;        dec bx
;        jnz PX
;        popa
;        add di,320
;        loop PY
;      %if SEGMENTS<256
;        add al,SEGMENTS
;        jnc PW
;      %endif
;        popa

;Palette: Luminance * Hue: diffuse = L*[0.2,H,1], specular = L^9 / 2
;NOTE(review): the [0.2,H,1] triple does not match the active hue variant [E]
;(blue = 0.75 - 0.25*H, green = 0.25 + 0.75*H, red = H*H) - confirm which is intended.
;bx (0 on entry) is the palette counter: bh carries luminance, bl carries hue.
  mov dx,3c8h  ; VGA DAC write-index port
  xor ax,ax
  out dx,al    ; start writing at palette entry 0
  inc dx       ; dx=3c9h: DAC data port, fed by the out in the MAD loop
PAL:
;Color

;  or bx,0b0000111110000111  ; bx = LLLL1111 1HHHH111  ; bl = H  navy green gold [A]
;  mov al,0x40
;  sub al,bl
;  push ax
;  imul ax,bx,2
;  push ax
;  mov ax,bx
;  sub al,48+128
;  imul al
;  shr ax,5
;  push ax

;  or bx,0b0000111110000111  ; bx = LLLL1111 1HHHH111  ; bl = (H/2 + 0.5)   pink blue green yellow [B]
;  mov al,0x80
;  sub al,bl
;  push ax      ; blue = 1 - H/2
;  push bx
;  imul bl
;  shr ax,4
;  not ax
;  push ax

;  or bx,0b0000111110000111  ; bx = LLLL1111 1HHHH111  ; bl = (H/2 + 0.5)  blue green yellow [C]
;  mov al,0x80
;  sub al,bl
;  push ax      ; blue = 1 - H/2
;  push bx      ; green = 0.5 + H/2
;  imul al      ; ax = H*H/4
;  shr ax,6
;  push ax

;  or bx,0b0000111110000111  ; bx = LLLL1111 1HHHH111  ; bl = (H/2 + 0.5)  blue green yellow [C2]
;  mov al,0x40
;  sub al,bl
;  push ax      ; blue = 1 - H/2
;  push bx      ; green = 0.5 + H/2
;  add al,0x40
;  imul al      ; ax = H*H/4
;  shr ax,6
;  push ax

;  or bx,0b0000111111000011  ; bx = LLLL1111 11HHHH11  ; bl = (H/4 + 0.75)  blue green yellow [D]
;  mov al,0x80
;  sub al,bl
;  push ax      ; blue = 0.75 - H/4
;  imul ax,-2
;  push ax      ; green = 0.5 + H/2
;  sub al,0x80
;  mul al       ; ax = H*H/4
;  shr ax,6
;  push ax

  ;Active hue variant [E]: push blue, green, red so the MAD loop pops red, green, blue.
  ;Only the low byte of each pushed word is used later (MAD reads al only).
  ;The forced 1 bits make the "inc bx" at the end of the palette loop carry straight
  ;into the H field and then into the L field, so each pass advances (L,H) by one step.
  or bx,0b0000111111000011  ; bx = LLLL1111 11HHHH11  ; bl = (H/4 + 0.75)  navy cyan gold [E]
  mov al,0x80
  sub al,bl    ; al = 0x80 - bl (mod 256)
  push ax      ; blue = 0.75 - 0.25*H
  imul ax,bx,3 ; al = 3*bl (mod 256)
  push ax      ; green = 0.25 + 0.75*H
  add al,bl    ; al = 4*bl (mod 256) = H in the high nibble
  mul al       ; red = H*H
  mov al,ah    ; keep the high byte of the square
  push ax

;  or bx,0b0000111111000011  ; bx = LLLL1111 11HHHH11  ; bl = (H/4 + 0.75)  navy cyan gold [E2]
;  mov al,0x80
;  sub al,bl
;  push ax      ; blue = 0.75 - H/4
;  imul ax,bx,3
;  push ax      ; green = 0.25 + H*3/4
;  mul al       ; red = 1/16 + H*3/8 + H*H*9/16
;  mov al,ah
;  push ax

;  or bx,0b0000111111000011  ; bx = LLLL1111 11HHHH11  ; bl = (H/4 + 0.75)  navy cyan gold [E3]
;  mov al,0x80
;  sub al,bl
;  push ax      ; blue = 0.75 - H/4
;  imul ax,bx,4
;  push ax      ; green = H
;  mul al       ; red = H*H
;  mov al,ah
;  push ax

;  or bx,0b0000111100001111  ; bx = LLLL1111 HHHH1111  pink blue green yellow [F]
;  imul ax,bx,-1
;  push ax    ; b=1-H
;  push bx    ; g=H
;  mul bl
;  shr ax,6
;  not al
;  push ax    ; g=H*(1-H)/8

;  or bx,0b0000111100001111  ; bx = LLLL1111 HHHH1111  cyan yellow [G]
;  imul ax,bx,-1
;  push ax    ; b=1-H
;  push dx    ; g=0.78
;  push bx    ; r=H

;  or bx,0b0000111110000111  ; bx = LLLL1111 1HHHH111  blue gold [H]
;  mov al,0x80
;  sub al,bl
;  push ax
;  push bx
;  imul ax,bx,2
;  push ax

;  or bx,0b0000111110000111  ; bx = LLLL1111 1HHHH111  cyan yellow [H2]
;  mov al,0x40
;  sub al,bl      ; blue = 0.75 - H/2
;  push ax
;  push bx        ; green = 0.5 + H/2
;  imul ax,bx,2
;  push ax        ; red = H

;  or bx,0b0000111100001111  ; bx = LLLL1111 HHHH1111  navy cyan [I]
;  push dx    ; b=0.78
;  push bx    ; g=H
;  push ax    ; r=last blue output (0..0.25)

;  or bx,0b0000111110000111  ; bx = LLLL1111 1HHHH111  ; bl = (H/2 + 0.5)  cyan gold yellow [J]
;  imul ax,bx,-1
;  push ax      ; blue = H/2
;  push bx      ; green = 0.5 + H/2
;  imul bl      ; ax = H*H
;  shr ax,6
;  push ax

;  or bx,0b0000111110000111  ; bx = LLLL1111 1HHHH111  ; bl = (H/2 + 0.5)  navy gold [K]
;  mov al,0x40
;  sub al,bl
;  push ax      ; blue = 0.75 - H/2
;  not al
;  push ax      ; green = 0.25 + H/2
;  sub al,0x40
;  imul al      ; ax = H*H
;  shr ax,6
;  push ax

;  or bx,0b0000111110000111  ; bx = LLLL1111 HHHH1111  ; bl = (H/2 + 0.5)  cyan green gold [L]
;  imul ax,bx,-1
;  push ax      ; blue = 0.75 - H/2
;  push bx      ; green = 0.25 + H/2
;  mul bl       ; ax = H*H
;  shr ax,6
;  not ax
;  push ax

;Specular
;The loop counter is the parity of the low byte of si: starting from a low byte of 0,
;inc gives 01, 02 (odd parity, jpo taken) and 03 (even, falls through) = 3 passes.
;NOTE(review): this assumes the low byte of si is 0 on entry (e.g. si=0x100), which is
;not listed in the header assumptions - TODO confirm.
  mov al,bh  ; al = L (bh = LLLL1111)
POW:
  mul al     ; ax = al*al
  mov al,ah  ; keep the high byte: al = al^2 / 256
  inc si
  jpo POW    ; 3 times
  mov cl,ah
  shr cl,1   ; cl=L^8/2 (0..127)

;Diffuse, add with saturation
;Pops the three words pushed by the hue code (red, green, blue in that order),
;adds the specular term, scales by L and sends a 6-bit value to the DAC data port.
MAD:
  pop ax     ; rgb
  add al,cl  ; al=L^8/2 + rgb
  jnc SAT
  salc       ; clamp to 0..255 (CF=1 here, so al becomes 0xff)
SAT:
  mul bh     ; ah=L*clamp(L^8/2 + rgb)
  shr ax,10  ; al = ah>>2: 8-bit -> 6-bit DAC range
  out dx,al
  dec si     ; undo the three increments of POW; same parity trick as counter
  jpo MAD    ; 3 times

  inc bx     ; next (L,H) step; the or at the top of the loop restores the fixed 1 bits
  jnz PAL    ; 256 entries; bx=0 on exit

;      ;Palette test
;        xor ax,ax  ; wait for a key
;        int 16h
;        mov ax,3   ; textmode
;        int 10h
;        ret

;K(x): address of the 16-byte vector whose four dwords are all x<<16, where x is
;the upper 16 bits of an IEEE-754 single (a multiple of 0x20). The table is built
;at 0x8000..0xffff by the PK loop at program start.
;The expansion and its argument are parenthesised so that K() stays a single term
;when used inside a larger expression or with a compound argument.
%define K(x) (0x8000 + 0x10*((x)/0x20))
%define K_TRANSLATION K(0x3e80)  ; 0.25
%define K_TIME_DELTA  K(0x3c80)  ; 1/64
%define K_EPS         K(0x3ca0)  ; 0.01953125 = 20/1024
%define K_LIGHT_SCALE K(0x4440)  ; 768 = 15/EPS
%define K_HUE_SCALE   K(0x41c0)  ; 24 = 16*1.5
%define K_NEG_ABS     K(0x8000)  ; -0 = 0x80000000 for -abs()
%define K_MINUS1      K(0xbf80)  ; -1
%define K_0           K(0x0000)  ; 0
%define K_1           K(0x3f80)  ; 1

;For 16:9 screens: pixel aspect ratio = 1.03
%define K_X_SCALE     K(0x3020)  ; 2.5 * 2**-32: x -> ..1.25
%define K_Y_SCALE     K(0x2fe0)  ; 1.75 * 2**-32: y -> ..0.6836

;For 4:3 screens: pixel aspect ratio = 0.96
;%define K_X_SCALE     K(0x3000)  ; 2.0 * 2**-32: x -> ..1.0
;%define K_Y_SCALE     K(0x3000)  ; 2.0 * 2**-32: y -> ..0.7813

  ;Start time: the dword at [si] is loaded as a 32-bit integer
  ;(presumably si=0x100, i.e. the first four bytes of the program - TODO confirm)
  ;Stack comments below list the x87 stack from st0 outwards.
  fild dword[si]   ;| t=t0

;For each frame: prepare rotation constants
;Three rotation angles t, ln(2)*t and log10(2)*t
M fadd dword[K_TIME_DELTA] ;| t+=dt
  fld st0
  fsincos          ;| C1 S1 t
  fldln2
  fmul st3         ;| 0.69315*t C1 S1 t
  fsincos          ;| C2 S2 C1 S1 t
  fldlg2
  fmul st5         ;| 0.30103*t C2 S2 C1 S1 t
  fsincos          ;| C3 S3 C2 S2 C1 S1 t

;Store each constant four times
;Each outer pass writes st0 to four consecutive dwords (one 16-byte vector) and pops it.
;jns tests the sign flag left by the last "add bl,4"; loop and fstp do not change it.
  mov bx,0x2020    ; bh=0x20
STORE:
  mov cl,4
STORE4:
  fst dword[bx]    ;0x2000 10 20 30 40 50 60 70 80
  add bl,4         ;    XY    C3 S3 C2 S2 C1 S1 scratch
  loop STORE4
  fstp st0
  jns STORE        ; loop 6 times (bl=0x20..0x70): bx=0x2080, only t left on the FPU stack

;Operands of the rotation loop in MAP: cosine at [bx], matching sine 16 bytes above
%define COS bx
%define SIN bx+0x10

;For each pixel: store x,y coordinates
;Four consecutive pixels (di..di+3) are handled at once, one per SSE lane.
;0xcccd is about 2^24/320, so the 32-bit product dx:ax = di*0xcccd has the screen row
;in its top byte (dh) and the position within the row in the bits below it.
X mov bl,0     ; bx=0x2000 (bh=0x20)
  mov cl,4
X4:
  mov ax,0xcccd
  mul di       ; dx:ax = di * 0xcccd
  add dx,0x9b80; bias the high word so both coordinates are centred around 0
  mov [bx],ax
  mov [bx+2],dx
  add bl,4
  inc di
  loop X4      ; di+=4 bx=0x2010

;Reading the same dwords one byte lower (bx-1) shifts each value left by 8 bits,
;which moves the within-row part into the top bits; hence the unaligned load in MAP.
%define INT_X bx-1  ; x = 2^32 * (-0.5..0.5)
%define INT_Y bx    ; y = 0xcccd * 320 * (-100..100) = 2^32 * (-0.3906..0.3906)

;SSE register roles; every register holds four lanes, one per pixel
%define x xmm0 ; XYZ coordinates for iteration
%define y xmm1
%define z xmm2
%define o xmm3 ; output: orbit trap
%define a xmm4 ; scratch, output: estimated distance
%define b xmm5 ; scratch
%define c xmm6 ; translation [c,c/4,0]
%define d xmm7 ; depth

;Trace steps along a ray
;Starts at depth -1 and advances by the distance MAP returned on the previous step.
;cx=24 relies on ch=0, which MAP restores on every return.
  mov cl,24
  movaps d,[K_MINUS1]; d=-1
  xorps a,a     ; first step adds 0
T addps d,a     ; d+=map(X,Y,d)
  call MAP
  loop T        ; 24 steps

;Normal, ambient occlusion
;Brightness comes from the difference of the distance estimate at depth d-EPS and at d.
  movaps [bx],a ; bx=0x2080
  subps d,[K_EPS]
  call MAP      ; a = map(X,Y,d-EPS)
  subps a,[bx]  ; a = map(X,Y,d-EPS) - map(X,Y,d)

;Depth fog: depth -1..0..1 -> color 1..1..0
;  movaps b,[K_1]
;  subps b,d
;  minps b,[K_1]
;  mulps a,b     ; a *= min(1-d,1)

;Color
;Lanes whose ray ended at depth >= 1 are forced to zero luminance.
  cmpltps d,[K_1]; d = d<1? 0xffffffff : 0
  andps a,d      ; a = d<1? a : 0

  mulps a,[K_LIGHT_SCALE] ; luminance scaled by 768 = 15/EPS
  mulps o,[K_HUE_SCALE]   ; orbit trap scaled by 24
  cvtps2dq a,a
  cvtps2dq o,o
  pslld a,4
  paddd a,o     ; color index = (L<<4) + H
  packssdw a,a  ; dwords -> words with signed saturation
  packuswb a,a  ; to 0..255
  movd [es:di-4],a ; write the four pixels; di was already advanced by 4 in X4

;Next pixel
  test di,di
  jnz X         ; until di wraps around to 0

;Esc test, next frame
  in al,0x60    ; read the keyboard data port
  dec al        ; zero only for value 1 (Esc make code)
  jnz M   ; fallthrough
;On Esc execution runs into MAP and leaves through its ret. sp should still be 0x8000
;(palette pushes and pops are balanced) and the word there is 0, so ret jumps to
;offset 0 of the segment (presumably the PSP exit stub - TODO confirm).

;Return the distance to the KIFS fractal
;In:  d = ray depth (four lanes), bh=0x20,
;     pixel coordinate dwords at 0x2000 (written by the X4 loop),
;     rotation constants at 0x2020..0x207f (written by the STORE loop)
;Out: a = estimated distance, o = orbit trap, bx=0x2080, ch=0
;Clobbers x,y,z,b,c, bl, ch and the flags; cl and d are not modified.
;Also entered by fallthrough after the Esc test, which exits through the final ret.
MAP:
  mov bl,0       ; bx=0x2000
  movups x,[INT_X]   ; bx-1 is not 16-byte aligned, hence movups
  cvtdq2ps y,[INT_Y]
  cvtdq2ps x,x
  mulps y,[K_Y_SCALE]
  mulps x,[K_X_SCALE]
  movaps z,d    ; x,y,z = X,Y,depth
  xorps o,o     ; o=0
  movaps c,[K_TRANSLATION] ; c=1/4: translation=[c,c/4,0]
  mov ch,15     ; number of iterations

;Rotate in the XZ, YX and ZY planes
;Three passes with bl=0x20,0x40,0x60; each pass rotates one plane and then cycles the
;axes, so the same code handles all three planes.
L mov bl,0x20
R movaps b,[COS]; b=C3 a=S3 | b=C2 a=S2 | b=C1 a=S1
  movaps a,[SIN]
  mulps b,z     ; b=Cz
  mulps z,a     ; z=Sz
  mulps a,x     ; a=Sx
  mulps x,[COS] ; x=Cx
  subps a,b     ; a=x'=Sx-Cz
  addps z,x     ; z=z'=Sz+Cx
  movaps x,y    ; cycle x,y,z <- y,z,a
  movaps y,z
  movaps z,a
  add bl,bh     ; 0x20 | 0x40 | 0x60
  jns R         ; bx=0x2080 a=z

;Reflect along X and Y
;OR-ing with the sign-bit mask forces every lane negative
  movaps b,[K_NEG_ABS]
  orps x,b      ; x=-|x|
  orps y,b      ; y=-|y|

;L_inf distance to [0,0,0]
  orps a,b      ; a=-|z|
  minps a,x
  minps a,y     ; a=-length = min(-|x|,-|y|,-|z|)

  xorps a,b     ; a=length (flip the sign bit back)

;Orbit trap
  maxps o,a     ; orbit=max(orbit,length)

;Translate by [c,c/4,0]
  movaps b,c
  mulps b,[K_TRANSLATION] ; b=c/4
  addps x,c     ; x+=c
  addps y,b     ; y+=c/4

;Scale translation
  subps c,b     ; c-=c/4 (c*=3/4)

;Next iteration
  dec ch
  jnz L         ; 15 iterations; ch=0 afterwards so callers can keep using cl with loop

  subps a,c
  subps a,c     ; a=length-2c
  ret           ; bx=0x2080
